import pandas as panda
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Let pandas display up to 57 columns before it truncates wide frames.
panda.set_option("display.max_columns", 57)

# Each usable line of columns.txt is expected to look like "<name>:<rest>";
# only the text before the first ":" is kept as the column name.
# Blank lines are skipped so that they cannot turn into bogus column names.
with open("columns.txt", "r", encoding="utf-8") as doc:
    colums = [line.split(":")[0] for line in doc if line.strip()]

# spambase.data has no header row, so the names read above are supplied here.
df = panda.read_csv("spambase.data", names=colums)
df.describe()
| word_freq_make | word_freq_address | word_freq_all | word_freq_3d | word_freq_our | word_freq_over | word_freq_remove | word_freq_internet | word_freq_order | word_freq_mail | word_freq_receive | word_freq_will | word_freq_people | word_freq_report | word_freq_addresses | word_freq_free | word_freq_business | word_freq_email | word_freq_you | word_freq_credit | word_freq_your | word_freq_font | word_freq_000 | word_freq_money | word_freq_hp | word_freq_hpl | word_freq_george | word_freq_650 | word_freq_lab | word_freq_labs | word_freq_telnet | word_freq_857 | word_freq_data | word_freq_415 | word_freq_85 | word_freq_technology | word_freq_1999 | word_freq_parts | word_freq_pm | word_freq_direct | word_freq_cs | word_freq_meeting | word_freq_original | word_freq_project | word_freq_re | word_freq_edu | word_freq_table | word_freq_conference | char_freq_; | char_freq_( | char_freq_[ | char_freq_! | char_freq_$ | char_freq_# | capital_run_length_average | capital_run_length_longest | capital_run_length_total | class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 | 4601.000000 |
| mean | 0.104553 | 0.213015 | 0.280656 | 0.065425 | 0.312223 | 0.095901 | 0.114208 | 0.105295 | 0.090067 | 0.239413 | 0.059824 | 0.541702 | 0.093930 | 0.058626 | 0.049205 | 0.248848 | 0.142586 | 0.184745 | 1.662100 | 0.085577 | 0.809761 | 0.121202 | 0.101645 | 0.094269 | 0.549504 | 0.265384 | 0.767305 | 0.124845 | 0.098915 | 0.102852 | 0.064753 | 0.047048 | 0.097229 | 0.047835 | 0.105412 | 0.097477 | 0.136953 | 0.013201 | 0.078629 | 0.064834 | 0.043667 | 0.132339 | 0.046099 | 0.079196 | 0.301224 | 0.179824 | 0.005444 | 0.031869 | 0.038575 | 0.139030 | 0.016976 | 0.269071 | 0.075811 | 0.044238 | 5.191515 | 52.172789 | 283.289285 | 0.394045 |
| std | 0.305358 | 1.290575 | 0.504143 | 1.395151 | 0.672513 | 0.273824 | 0.391441 | 0.401071 | 0.278616 | 0.644755 | 0.201545 | 0.861698 | 0.301036 | 0.335184 | 0.258843 | 0.825792 | 0.444055 | 0.531122 | 1.775481 | 0.509767 | 1.200810 | 1.025756 | 0.350286 | 0.442636 | 1.671349 | 0.886955 | 3.367292 | 0.538576 | 0.593327 | 0.456682 | 0.403393 | 0.328559 | 0.555907 | 0.329445 | 0.532260 | 0.402623 | 0.423451 | 0.220651 | 0.434672 | 0.349916 | 0.361205 | 0.766819 | 0.223812 | 0.621976 | 1.011687 | 0.911119 | 0.076274 | 0.285735 | 0.243471 | 0.270355 | 0.109394 | 0.815672 | 0.245882 | 0.429342 | 31.729449 | 194.891310 | 606.347851 | 0.488698 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
| 25% | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.588000 | 6.000000 | 35.000000 | 0.000000 |
| 50% | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.100000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.310000 | 0.000000 | 0.220000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.065000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2.276000 | 15.000000 | 95.000000 | 0.000000 |
| 75% | 0.000000 | 0.000000 | 0.420000 | 0.000000 | 0.380000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.160000 | 0.000000 | 0.800000 | 0.000000 | 0.000000 | 0.000000 | 0.100000 | 0.000000 | 0.000000 | 2.640000 | 0.000000 | 1.270000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.110000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.188000 | 0.000000 | 0.315000 | 0.052000 | 0.000000 | 3.706000 | 43.000000 | 266.000000 | 1.000000 |
| max | 4.540000 | 14.280000 | 5.100000 | 42.810000 | 10.000000 | 5.880000 | 7.270000 | 11.110000 | 5.260000 | 18.180000 | 2.610000 | 9.670000 | 5.550000 | 10.000000 | 4.410000 | 20.000000 | 7.140000 | 9.090000 | 18.750000 | 18.180000 | 11.110000 | 17.100000 | 5.450000 | 12.500000 | 20.830000 | 16.660000 | 33.330000 | 9.090000 | 14.280000 | 5.880000 | 12.500000 | 4.760000 | 18.180000 | 4.760000 | 20.000000 | 7.690000 | 6.890000 | 8.330000 | 11.110000 | 4.760000 | 7.140000 | 14.280000 | 3.570000 | 20.000000 | 21.420000 | 22.050000 | 2.170000 | 10.000000 | 4.385000 | 9.752000 | 4.081000 | 32.478000 | 6.003000 | 19.829000 | 1102.500000 | 9989.000000 | 15841.000000 | 1.000000 |
# Print a concise summary of the frame: index range, per-column non-null
# counts and dtypes, and memory usage.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4601 entries, 0 to 4600 Data columns (total 58 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 word_freq_make 4601 non-null float64 1 word_freq_address 4601 non-null float64 2 word_freq_all 4601 non-null float64 3 word_freq_3d 4601 non-null float64 4 word_freq_our 4601 non-null float64 5 word_freq_over 4601 non-null float64 6 word_freq_remove 4601 non-null float64 7 word_freq_internet 4601 non-null float64 8 word_freq_order 4601 non-null float64 9 word_freq_mail 4601 non-null float64 10 word_freq_receive 4601 non-null float64 11 word_freq_will 4601 non-null float64 12 word_freq_people 4601 non-null float64 13 word_freq_report 4601 non-null float64 14 word_freq_addresses 4601 non-null float64 15 word_freq_free 4601 non-null float64 16 word_freq_business 4601 non-null float64 17 word_freq_email 4601 non-null float64 18 word_freq_you 4601 non-null float64 19 word_freq_credit 4601 non-null float64 20 word_freq_your 4601 non-null float64 21 word_freq_font 4601 non-null float64 22 word_freq_000 4601 non-null float64 23 word_freq_money 4601 non-null float64 24 word_freq_hp 4601 non-null float64 25 word_freq_hpl 4601 non-null float64 26 word_freq_george 4601 non-null float64 27 word_freq_650 4601 non-null float64 28 word_freq_lab 4601 non-null float64 29 word_freq_labs 4601 non-null float64 30 word_freq_telnet 4601 non-null float64 31 word_freq_857 4601 non-null float64 32 word_freq_data 4601 non-null float64 33 word_freq_415 4601 non-null float64 34 word_freq_85 4601 non-null float64 35 word_freq_technology 4601 non-null float64 36 word_freq_1999 4601 non-null float64 37 word_freq_parts 4601 non-null float64 38 word_freq_pm 4601 non-null float64 39 word_freq_direct 4601 non-null float64 40 word_freq_cs 4601 non-null float64 41 word_freq_meeting 4601 non-null float64 42 word_freq_original 4601 non-null float64 43 word_freq_project 4601 non-null float64 44 word_freq_re 4601 non-null float64 45 
word_freq_edu 4601 non-null float64 46 word_freq_table 4601 non-null float64 47 word_freq_conference 4601 non-null float64 48 char_freq_; 4601 non-null float64 49 char_freq_( 4601 non-null float64 50 char_freq_[ 4601 non-null float64 51 char_freq_! 4601 non-null float64 52 char_freq_$ 4601 non-null float64 53 char_freq_# 4601 non-null float64 54 capital_run_length_average 4601 non-null float64 55 capital_run_length_longest 4601 non-null int64 56 capital_run_length_total 4601 non-null int64 57 class 4601 non-null int64 dtypes: float64(55), int64(3) memory usage: 2.0 MB
# Number of missing values in each column.
df.isna().sum()
word_freq_make 0 word_freq_address 0 word_freq_all 0 word_freq_3d 0 word_freq_our 0 word_freq_over 0 word_freq_remove 0 word_freq_internet 0 word_freq_order 0 word_freq_mail 0 word_freq_receive 0 word_freq_will 0 word_freq_people 0 word_freq_report 0 word_freq_addresses 0 word_freq_free 0 word_freq_business 0 word_freq_email 0 word_freq_you 0 word_freq_credit 0 word_freq_your 0 word_freq_font 0 word_freq_000 0 word_freq_money 0 word_freq_hp 0 word_freq_hpl 0 word_freq_george 0 word_freq_650 0 word_freq_lab 0 word_freq_labs 0 word_freq_telnet 0 word_freq_857 0 word_freq_data 0 word_freq_415 0 word_freq_85 0 word_freq_technology 0 word_freq_1999 0 word_freq_parts 0 word_freq_pm 0 word_freq_direct 0 word_freq_cs 0 word_freq_meeting 0 word_freq_original 0 word_freq_project 0 word_freq_re 0 word_freq_edu 0 word_freq_table 0 word_freq_conference 0 char_freq_; 0 char_freq_( 0 char_freq_[ 0 char_freq_! 0 char_freq_$ 0 char_freq_# 0 capital_run_length_average 0 capital_run_length_longest 0 capital_run_length_total 0 class 0 dtype: int64
# Pairwise correlation between every pair of columns.
correlation = df.corr()

plt.figure(figsize=(20, 20))

# Boolean mask that hides the strict upper triangle (k=1 keeps the diagonal
# visible). It is built from an all-True array rather than from the
# correlation values themselves: a coefficient that is exactly 0.0 would be
# falsy and would therefore stay unmasked.
matrix = np.triu(np.ones_like(correlation, dtype=bool), k=1)

ax = sns.heatmap(
    correlation,
    xticklabels=correlation.columns,
    yticklabels=correlation.columns,
    cmap="coolwarm",
    square=True,
    linewidths=0.1,
    mask=matrix,
)
ax.set(title="all variables correlation heatmap")
[Text(0.5, 1.0, 'all variables correlation heatmap')]
# Keep the columns whose correlation with "class" is strongest in absolute
# value, so that strongly negative and strongly positive ones both qualify.
k = 20  # number of columns to keep
class_strength = correlation["class"].abs()
cols = class_strength.nlargest(k).index

# Correlation matrix restricted to the selected columns (one column per variable).
cm = np.corrcoef(df[cols].to_numpy(), rowvar=False)

plt.figure(figsize=(15, 15))
ax = sns.heatmap(
    cm,
    xticklabels=cols.values,
    yticklabels=cols.values,
    cmap="coolwarm",
    annot=True,
    square=True,
    fmt=".2f",
)
ax.set(title=f"top {k} variables correlation with class heatmap");
plt.show()
Some correlations between the target class (spam vs. non-spam) and the 20 variables most correlated with it
# Pairwise plots for the first five entries of `cols`, coloured by "class".
# NOTE(review): hue="class" requires "class" to be among those five entries;
# it is expected to rank first because its correlation with itself is 1.
sns.pairplot(data=df[list(cols[:5])], hue="class")
<seaborn.axisgrid.PairGrid at 0x137f4f5b0>
# Mean of every word/char frequency column for each class.
# "class" is the grouping key and the capital_run_length_* columns are
# summarised in their own pivot, so they are left out here.
excluded_columns = [
    "class",
    "capital_run_length_average",
    "capital_run_length_longest",
    "capital_run_length_total",
]
# pivot_table expects column labels for `values`, not a DataFrame.
frequency_columns = df.drop(excluded_columns, axis=1).columns.tolist()
pivot_class_freq = panda.pivot_table(
    df,
    values=frequency_columns,
    index="class",
    aggfunc="mean",
)
pivot_class_freq
| char_freq_! | char_freq_# | char_freq_$ | char_freq_( | char_freq_; | char_freq_[ | word_freq_000 | word_freq_1999 | word_freq_3d | word_freq_415 | word_freq_650 | word_freq_85 | word_freq_857 | word_freq_address | word_freq_addresses | word_freq_all | word_freq_business | word_freq_conference | word_freq_credit | word_freq_cs | word_freq_data | word_freq_direct | word_freq_edu | word_freq_email | word_freq_font | word_freq_free | word_freq_george | word_freq_hp | word_freq_hpl | word_freq_internet | word_freq_lab | word_freq_labs | word_freq_mail | word_freq_make | word_freq_meeting | word_freq_money | word_freq_order | word_freq_original | word_freq_our | word_freq_over | word_freq_parts | word_freq_people | word_freq_pm | word_freq_project | word_freq_re | word_freq_receive | word_freq_remove | word_freq_report | word_freq_table | word_freq_technology | word_freq_telnet | word_freq_will | word_freq_you | word_freq_your | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| class | ||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 0 | 0.109984 | 0.021713 | 0.011648 | 0.158578 | 0.050281 | 0.022684 | 0.007088 | 0.197744 | 0.000886 | 0.077787 | 0.193806 | 0.169455 | 0.077306 | 0.244466 | 0.008318 | 0.200581 | 0.048346 | 0.051227 | 0.007579 | 0.072027 | 0.150986 | 0.083117 | 0.287184 | 0.097292 | 0.045226 | 0.073587 | 1.265265 | 0.895473 | 0.431994 | 0.038415 | 0.162794 | 0.165854 | 0.167170 | 0.073479 | 0.216808 | 0.017138 | 0.038049 | 0.070581 | 0.181040 | 0.044544 | 0.018723 | 0.061664 | 0.121679 | 0.126636 | 0.415760 | 0.021711 | 0.009383 | 0.042403 | 0.008192 | 0.141671 | 0.106033 | 0.536324 | 1.270341 | 0.438702 |
| 1 | 0.513713 | 0.078877 | 0.174478 | 0.108970 | 0.020573 | 0.008199 | 0.247055 | 0.043469 | 0.164672 | 0.001776 | 0.018798 | 0.006928 | 0.000518 | 0.164650 | 0.112079 | 0.403795 | 0.287507 | 0.002101 | 0.205521 | 0.000055 | 0.014562 | 0.036718 | 0.014727 | 0.319228 | 0.238036 | 0.518362 | 0.001550 | 0.017479 | 0.009173 | 0.208141 | 0.000684 | 0.005968 | 0.350507 | 0.152339 | 0.002443 | 0.212879 | 0.170061 | 0.008450 | 0.513955 | 0.174876 | 0.004710 | 0.143547 | 0.012427 | 0.006244 | 0.125091 | 0.118434 | 0.275405 | 0.083574 | 0.001219 | 0.029515 | 0.001274 | 0.549972 | 2.264539 | 1.380370 |
plt.figure(figsize=(16, 8))
plt.xticks(rotation=70)

# First pivot row minus second pivot row, i.e. the per-column mean for
# non-spam minus the mean for spam (as the title states).
frequency_gap = pivot_class_freq.iloc[0] - pivot_class_freq.iloc[1]
freq_ax = sns.barplot(x=pivot_class_freq.columns, y=frequency_gap)
freq_ax.set(
    title="Non Spam minus Spam average word and char frequency",
    ylabel="Average frequency difference",
)
[Text(0.5, 1.0, 'Non Spam minus Spam average word and char frequency'), Text(0, 0.5, 'Average frequency difference')]
The words "you", "your" and "free" are far more frequent in spam emails than in !spam.
The words "george", "hp" and "hpl" are far more frequent in !spam emails than in spam.
From the dataset description we know that the word "George" (and "650" as well, although it is less significant) indicates a non-spam email. So the spammer doesn't know the victim's name and calls him "you" instead of using his real name.
# Per-class average of the three capital-run statistics (pivot_table
# aggregates with the mean by default).
# pivot_table expects column labels for `values`, not a DataFrame.
capital_columns = [
    "capital_run_length_average",
    "capital_run_length_longest",
    "capital_run_length_total",
]
pivot_class_cap = panda.pivot_table(df, values=capital_columns, index="class")
pivot_class_cap
| capital_run_length_average | capital_run_length_longest | capital_run_length_total | |
|---|---|---|---|
| class | |||
| 0 | 2.377301 | 18.214491 | 161.470947 |
| 1 | 9.519165 | 104.393271 | 470.619415 |
plt.xticks(rotation=10)

# Second pivot row minus first pivot row, i.e. the spam average minus the
# non-spam average for each capital-run statistic (as the title states).
capital_gap = pivot_class_cap.iloc[1] - pivot_class_cap.iloc[0]
cap_ax = sns.barplot(x=pivot_class_cap.columns, y=capital_gap)
cap_ax.set(title="Spam minus !Spam", ylabel="average difference")
[Text(0.5, 1.0, 'Spam minus !Spam'), Text(0, 0.5, 'average difference')]
Spam emails have:
Capital letters are used far more often in spam emails: the more frequent they are, the more likely the email is spam.
So the spammer prefers to use capital letters to focus the victim's attention on specific words, to scare him and rush him into clicking on the fake link in the email.